Packages

library(tidyverse)
library(readxl)
library(sf)
library(leaflet)

Read geocoded entries data

# Load the geocoded entries workbook; the first row of the sheet provides the
# column names (read_excel's default behaviour).
geocoded_entries <- read_excel("geocoded_entries.xlsx")

# Preview the first rows of the table.
head(geocoded_entries)

Geocoded information

What proportion of the addresses detected by NER are precisely geocoded?

# General overview: share of rows with and without a precise geometry.
# `geometry` is TRUE when `precise.geom` is present and FALSE when it is NA.
geocoded_entries %>%
  select(precise.geom, entry_id) %>%
  mutate(geometry = !is.na(precise.geom)) %>%
  count(geometry) %>%
  mutate(freq = n / sum(n)) %>% # share of all rows
  ggplot(aes(x = geometry, y = freq)) + # y as frequency
  geom_col() +
  labs(
    title = "<LOC> and <CARD> elements precisely geocoded from the entries",
    caption = "ANR SoDUCo. Data: GeoHistoricalData"
  ) +
  theme_bw()

# Specific dates: share of rows with / without a precise geometry, computed
# separately for each `published` value.
geocoded_entries %>%
  select(precise.geom, entry_id, published) %>%
  mutate(
    geometry = !is.na(precise.geom),
    # character values give one discrete fill colour per `published` value
    published = as.character(published)
  ) %>%
  group_by(geometry, published) %>%
  summarise(n = n()) %>%
  # group_by() replaces the grouping left over by summarise(), so the shares
  # below are normalised within each `published` value
  group_by(published) %>%
  mutate(freq = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = geometry, y = freq, fill = published)) + # y as frequency
  geom_col(position = "dodge") +
  labs(
    title = "<LOC> and <CARD> elements precisely geocoded from the entries",
    caption = "ANR SoDUCo. Data: GeoHistoricalData"
  ) +
  theme_bw()
## `summarise()` has grouped output by 'geometry'. You can override using the
## `.groups` argument.

# Specific dates, absolute numbers: count of rows with / without a precise
# geometry for each `published` value (raw counts, not shares).
geocoded_entries %>%
  select(precise.geom, entry_id, published) %>%
  mutate(
    geometry = !is.na(precise.geom),
    # character values give one discrete fill colour per `published` value
    published = as.character(published)
  ) %>%
  group_by(geometry, published) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  ggplot(aes(x = geometry, y = n, fill = published)) + # y as count
  geom_col(position = "dodge") +
  labs(
    title = "<LOC> and <CARD> elements precisely geocoded from the entries",
    caption = "ANR SoDUCo. Data: GeoHistoricalData"
  ) +
  theme_bw()
## `summarise()` has grouped output by 'geometry'. You can override using the
## `.groups` argument.

What is the proportion of entries which are precisely geocoded?

# Entries that appear on exactly one row (a single localisation): number and
# share of them with ("VRAI") and without ("FAUX") a precise geometry.
# Result columns: order_n (always 1), geometry, n, freq.
order1 <- geocoded_entries %>%
  select(precise.geom, entry_id, order) %>%
  # order_n = number of rows sharing the same entry_id
  add_count(entry_id, name = "order_n") %>%
  filter(order_n == 1) %>%
  mutate(geometry = if_else(is.na(precise.geom), "FAUX", "VRAI")) %>%
  count(geometry) %>%
  mutate(
    freq = n / sum(n),
    order_n = 1
  ) %>%
  select(order_n, geometry, n, freq)

# Entries that appear on several rows (more than one localisation). For each
# number of rows per entry (`order_n`), count the rows with ("VRAI") and
# without ("FAUX") a precise geometry and their share within that `order_n`.
# Result columns: order_n, geometry, n, freq (ungrouped).
notorder1 <- geocoded_entries %>%
  select(precise.geom, entry_id, order) %>%
  mutate(geometry = if_else(is.na(precise.geom), "FAUX", "VRAI")) %>%
  # order_n = number of rows sharing the same entry_id
  add_count(entry_id, name = "order_n") %>%
  filter(order_n != 1) %>%
  group_by(order_n) %>%
  # Summing a logical comparison counts exact label matches; both columns are
  # always produced, so a label with zero rows still yields an n = 0 row.
  summarise(
    FAUX = sum(geometry == "FAUX"),
    VRAI = sum(geometry == "VRAI")
  ) %>%
  pivot_longer(cols = FAUX:VRAI, names_to = "geometry", values_to = "n") %>%
  group_by(order_n) %>%
  mutate(freq = n / sum(n)) %>%
  # Drop the grouping so later verbs on `notorder1` are not silently applied
  # per order_n.
  ungroup()

# Very few entries have more than 8 localisations: count the elements they
# account for; they are left out of the chart below.
order1 %>%
  bind_rows(notorder1) %>%
  filter(order_n > 8) %>%
  summarise(countelements = sum(n))

# Share of precisely geocoded elements by number of localisations per entry.
# The `<= 8` filter is the exact complement of the `> 8` tally above, so
# entries with exactly 8 localisations are no longer dropped from both.
order1 %>%
  bind_rows(notorder1) %>%
  filter(order_n <= 8) %>%
  # as.character() makes order_n a discrete fill (one colour per value)
  ggplot(mapping = aes(x = geometry, y = freq, fill = as.character(order_n))) + # y as frequency
  geom_bar(stat = "identity", position = "dodge") +
  theme_bw() +
  ggtitle("Entries precisely geocoded") +
  labs(caption = "ANR SoDUCo. Data: GeoHistoricalData")

Mapping entries: 1839

As we can see, some geocoded points fall far from Paris, such as the entries for rue St-Martin, which were located on the island.

Mapping entries: 1845

Entries only in the Ile-de-France region

Mapping entries: 1875

Mapping entries: 1904